#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "08_analysis.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---


# load packages
library(MASS)            # CRAN v7.3-60
library(dplyr)           # CRAN v1.1.2
library(scales)          # CRAN v1.2.1
library(ggplot2)         # CRAN v3.4.2
library(broom)           # CRAN v1.0.5
library(texreg)          # CRAN v1.38.6
library(tidyr)           # CRAN v1.3.0
library(forcats)         # CRAN v1.0.0
library(xtable)          # CRAN v1.8-4
library(stringr)         # CRAN v1.5.0
library(Hmisc)           # CRAN v5.1-0
library(fixest)          # CRAN v0.11.1
library(marginaleffects) # CRAN v0.16.0
library(clarify)         # CRAN v0.2.0
library(sandwich)        # CRAN v3.0-2
library(readstata13)     # CRAN v0.10.1
library(modelsummary)    # CRAN v1.4.1

# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() function for each package, 
# you can adjust them at all once using the
# the following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/


# print output of sessionInfo()
sessionInfo()

# load custom ggplot2 scheme
source("function_theme_base.R")

# load dataset for regression analysis, 
# which also includes the Wordfish estimates
# from the previous script
dat_reg <- readRDS("data_regression_bert_wf.rds")

# number of unique manifestos
length(unique(dat_reg$filename))

# number of unique candidates
length(unique(dat_reg$pid))

# number of manifestos per electin

dat_manifestos_election <- dat_reg |> 
    dplyr::select(filename, year) |> 
    unique() |> 
    group_by(year) |> 
    count()
dat_manifestos_election

# check if any candidates with missing values in 
# core variables are included (is not the case)
dat_reg_check <- dat_reg |> 
    dplyr::select(filename, year, 
           type_elected, won_before_times_max9,
           female, dynasty, name_jp, pid,
           party_en, 
           distance_abs_wf_mean) |> 
    unique()

# make sure number of incomplete cases is 0
dat_reg_check %>% 
    filter(!complete.cases(.)) |> 
    nrow()



# define factor levels for positions 
levels_positions <- c("Ministerial Post",
                      "Party Policy Division Post",
                      "Committee Post",
                      "Cabinet Post",
                      "Legislative Post (Combined)"
)


# Figure 02 ----

# bootstrap average salience (per election)

dat_prop_bootstrapped <- dat_reg |> 
    filter(policy_area_harmonised != "Justice Affairs") |> 
    dplyr::select(year, filename, policy_area_harmonised, area_pekkanen, prop_policyarea_bert_no_na,
                  prop_policyarea_relevant_bert_no_na) |> 
    unique() |> 
    gather(measure, prop, -c("year", "filename", "area_pekkanen", "policy_area_harmonised")) |> 
    group_by(year, measure, area_pekkanen, policy_area_harmonised) |> 
    do(data.frame(rbind(Hmisc::smean.cl.boot(.$prop)))) |> 
    filter(measure == "prop_policyarea_relevant_bert_no_na")


# get averages across elections
dat_props_means <- dat_prop_bootstrapped |> 
    group_by(policy_area_harmonised) |> 
    summarise(mean_prop = mean(Mean, na.rm = TRUE)) |> 
    arrange(-mean_prop)


# reorder factor levels
dat_props_means$policy_area_harmonised <- factor(
    dat_props_means$policy_area_harmonised,
    levels = dat_props_means$policy_area_harmonised)


dat_prop_bootstrapped$policy_area_harmonised <- factor(
    dat_prop_bootstrapped$policy_area_harmonised,
    levels = dat_props_means$policy_area_harmonised
)


dat_prop_bootstrapped <- dat_prop_bootstrapped |> 
    mutate(area_pekkanen = factor(area_pekkanen,
                                   levels = c("Distributive",
                                              "Public Goods",
                                              "High Policy")))


# plot Figure 02
ggplot(dat_prop_bootstrapped, 
       aes(x = forcats::fct_rev(factor(year)), 
                                  y = Mean, colour = area_pekkanen,
                                  shape = area_pekkanen)) +
    geom_point(size = 2) + 
    geom_linerange(aes(ymin = Lower,
                       ymax = Upper)) +
    scale_shape_manual(name = "Policy Area", values = c(17, 16, 15)) +
    scale_colour_manual(name = "Policy Area", values = c("black",
                                                         "#00798c",
                                                         "#d1495b")) +
    facet_wrap(~policy_area_harmonised, ncol = 1) +
    geom_hline(data = dat_props_means, aes(yintercept = mean_prop),
               linetype = "dashed") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                       limits = c(0, 0.5),
                       breaks = c(seq(0, 0.5, 0.05))) +
    coord_flip() + 
    geom_label(data = dat_props_means, aes(label = paste0(format(round(100 * mean_prop, 1), nsmall = 1),  "%"),
                                           x = 4, y = 0.48),
               inherit.aes = FALSE, hjust = 0, colour = "grey30") +
    labs(x = "Election Year",
         y = "Issue Salience in Manifestos") +
    theme(axis.text.y = element_text(size = 10),
          strip.text = element_text(size = 13))
ggsave("fig_02.pdf",
       width = 9, height = 12)


# Figure 03 and Table 01 ----
# Main analysis: five measures

table(dat_reg$policy_area_harmonised)

# make sure remove Justice Affairs from all analyses 
# (but should have be removed automatically since no information on DV)

dat_reg <- dat_reg |> 
    filter(policy_area_harmonised != "Justice Affairs")

dat_reg_at_least_one <- dat_reg |> 
    filter(type_posts == "Legislative Post (Combined)")

table(dat_reg$type_posts)

type_posts_vector <- c(
    "Legislative Post (Combined)",
    "Cabinet Post", 
    "Committee Post",
    "Party Policy Division Post",
    "Ministerial Post")


# create empty list to store regression model
list_base <- list()

# store predicted probabilities
dat_pred_base <- data.frame()

# store coefficients
dat_coefs_base <- data.frame()


for (i in type_posts_vector) {
    
    lm_type <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                         type_elected  +
                         poly(won_before_times_max9, 2) +
                         
                         female +
                         distance_abs_wf_mean + 
                         dynasty |
                         policy_area_harmonised + 
                         factor(year) + factor(region), 
                     data = filter(dat_reg, type_posts == i),
                     vcov = vcov_cluster("filename"),
                     family = binomial(link = "logit"))
    
    list_base[[i]] <- lm_type
    
    dat_coefs <- broom::tidy(lm_type) 
    
    dat_coefs$type_post <- i
    
    dat_eff <- marginaleffects::plot_predictions(lm_type, draw = FALSE,                                        
                                                 condition = c("prop_policyarea_relevant_bert_no_na"))
    
    dat_eff$type_post <- i
    
    dat_pred_base <- bind_rows(dat_eff, dat_pred_base)
    dat_coefs_base <- bind_rows(dat_coefs, dat_coefs_base)
}


# get predicted probabilities at different values

# no focus on policy area
dat_pred_base |> 
    filter(type_post == "Legislative Post (Combined)") |> 
    filter(between(prop_policyarea_relevant_bert_no_na, 0, 0)) |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na,
                  estimate, conf.low, conf.high)

# roughly 50% of focus
dat_pred_base |> 
    filter(type_post == "Legislative Post (Combined)") |> 
    filter(between(prop_policyarea_relevant_bert_no_na, 0.49, 0.52)) |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na,
                  estimate, conf.low, conf.high)

# entire manifesto on one policy area
dat_pred_base |> 
    filter(type_post == "Legislative Post (Combined)") |> 
    filter(between(prop_policyarea_relevant_bert_no_na, 0.99, 1)) |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na,
                  estimate, conf.low, conf.high)

# get data frame with distribution of independent variable
# for plot
port_prop_type <- dat_reg |>
    dplyr::select(type_posts, 
                  prop_policyarea_relevant_bert_no_na) |>
    unique()


dat_pred_base$type_post <- factor(dat_pred_base$type_post,
                                  levels = rev(levels_positions))

ggplot(data = dat_pred_base, aes(x = prop_policyarea_relevant_bert_no_na, 
                                 y = estimate,
                                 ymin = conf.low,
                                 ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_line() +
    geom_rug(data = port_prop_type, aes(x = prop_policyarea_relevant_bert_no_na),
             inherit.aes = FALSE, 
             alpha = 1, 
             length = unit(0.025, "npc"),
             linewidth = 0.1, colour = "grey30") +
    facet_wrap(~type_post, nrow = 1, labeller = label_wrap_gen(width = 18)) +
    labs(x = "Manifesto Salience of Policy Area",
         y = "Pr(Post in Policy Area)")  +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 0.9, 0.3))) +
    scale_y_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 1, 0.2)))
ggsave("fig_03.pdf", width = 9, height = 6)


# Prepare the recoding of regression coefficients
# for all models
list_rename_coefs <- list(
    "count_bert" = "Manifesto Salience (Count of Statements)",
    "prop_policyarea_bert_no_na" = "Manifesto Salience (% of all Statements)",
    "position_dummy_lag1" = "Post in Previous Cycle (lagged DV)",
    "importance_portfolio" =  "Portfolio Importance",
    "prop_policyarea_relevant_bert_no_na" = "Manifesto Salience",
    "prop_policyarea_relevant_bert_no_na:importance_portfolio" = "Manifesto Salience x Portfolio Importance",
    "poly(won_before_times_max9, 2)1" = "Number of Terms",
    "poly(won_before_times_max9, 2)2" = "Number of Terms (squared)",
    "type_electedZombie" = "Elected: Zombie (ref.: SMD)",
    "female1" = "Gender: Female (ref.: Male)",
    "distance_abs_wf_mean" = "Ideological Distance from Party",
    "dynasty" = "Dynasty")


# show regression table
screenreg(list_base,
          custom.coef.map = list_rename_coefs,
          custom.gof.names = c(
              "Num. obs.",
              "Fixed effects: Policy Area",
              "Fixed effects: Election Year",
              "Fixed effects: Region",
              "Deviance", "Log Likelihood", "Pseudo R2"
          ))


# save Table 1
htmlreg(list_base,
        custom.coef.map = list_rename_coefs,
        single.row = TRUE,
        caption = NULL,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_01.html")


# Figure 04 ----

# Predicted probabilities and first differences 
# based on interaction between manifesto salience 
# and broad policy area

# change year to factor
dat_reg$year <- factor(dat_reg$year)

# run logistic regression 
mod_clarifypackage <- glm(position_dummy ~
                              prop_policyarea_relevant_bert_no_na * area_pekkanen +
                              type_elected  +
                              poly(won_before_times_max9, 2) +
                              female +
                              distance_abs_wf_mean + 
                              dynasty  +
                              year + region,
                          data = filter(dat_reg, type_posts == "Legislative Post (Combined)"),
                          family = binomial(link = "logit"))

# print Table A07
modelsummary(list("Model 1" = mod_clarifypackage),
             vcov = vcovCL(mod_clarifypackage, cluster = ~ filename),
             # use clustered standard errors
             estimate = "{estimate} ({std.error}){stars}",
             fmt = 2,
             statistic = NULL,
             stars = c('*' = .05, '**' = .001, '***' = .0001),
             coef_omit = c("year|region|(Intercept)"),
             coef_map = c("prop_policyarea_relevant_bert_no_na" = "Manifesto Salience",
                          "area_pekkanenHigh Policy" = "Area: High Policy (ref.: Distributive)",
                          "area_pekkanenPublic Goods" = "Area: Public Goods",
                          "poly(won_before_times_max9, 2)1" = "Number of Terms",
                          "poly(won_before_times_max9, 2)2" = "Number of Terms (squared)",
                          "type_electedZombie" = "Elected: Zombie (ref.: SMD)",
                          "female1" = "Gender: Female (ref.: Male)",
                          "distance_abs_wf_mean" = "Ideological Distance from Party",
                          "dynasty" = "Dynasty",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenHigh Policy" = "Manifesto Salience x Area: High Policy",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenPublic Goods" = "Manifesto Salience x Area: Public Goods"))


# save Table A07 as Word document
modelsummary(list("Model 1" = mod_clarifypackage), 
             output = "tab_a07.docx",
             vcov = vcovCL(mod_clarifypackage, cluster = ~ filename),
             # use clustered standard errors
             estimate = "{estimate} ({std.error}){stars}",
             fmt = 2,
             statistic = NULL,
             stars = c('*' = .05, '**' = .001, '***' = .0001),
             coef_omit = c("year|region|(Intercept)"),
             coef_map = c("prop_policyarea_relevant_bert_no_na" = "Manifesto Salience",
                          "area_pekkanenHigh Policy" = "Area: High Policy (ref.: Distributive)",
                          "area_pekkanenPublic Goods" = "Area: Public Goods",
                          "poly(won_before_times_max9, 2)1" = "Number of Terms",
                          "poly(won_before_times_max9, 2)2" = "Number of Terms (squared)",
                          "type_electedZombie" = "Elected: Zombie (ref.: SMD)",
                          "female1" = "Gender: Female (ref.: Male)",
                          "distance_abs_wf_mean" = "Ideological Distance from Party",
                          "dynasty" = "Dynasty",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenHigh Policy" = "Manifesto Salience x Area: High Policy",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenPublic Goods" = "Manifesto Salience x Area: Public Goods"))

# simulate coefficients and first differences
# firs set seed for reproducibility
set.seed(123)

# cluster standard errors by filename 
# (as done in all other models by default since feglm is used by defautl)
s <- sim(mod_clarifypackage, n = 1000, 
         vcov = vcovCL(mod_clarifypackage, cluster = ~ filename))

# get predicted probabilities
est_pred <- sim_setx(
    s, x = list(area_pekkanen = c("Distributive", "Public Goods", "High Policy"),
                prop_policyarea_relevant_bert_no_na = seq(0, 1, by = 0.1))
)

# store as data frame
est_pred_df <- as.data.frame(est_pred) |> 
    gather(variable, coefficient) |> 
    separate(variable, into = c("area_pekkanen", "prop_policyarea"), sep = ", ") |> 
    mutate(prop_policyarea = str_remove_all(prop_policyarea, "prop_policyarea_relevant_bert_no_na =")) |> 
    mutate(area_pekkanen = str_remove_all(area_pekkanen, '"')) |> 
    mutate(area_pekkanen = str_remove_all(area_pekkanen, "area_pekkanen = "))

# get averages and confidence intervals of 
# simulated values
est_pred_df_sum <- est_pred_df |> 
    group_by(area_pekkanen, prop_policyarea) |> 
    mutate(prop_policyarea = as.numeric(prop_policyarea)) |> 
    summarise(estimate = mean(coefficient),
              conf.low = quantile(coefficient, 0.025),
              conf.high = quantile(coefficient, 0.975))

# get value for maximum salience
est_pred_df_sum_max <- est_pred_df_sum |> 
    group_by(area_pekkanen, prop_policyarea) |> 
    filter(prop_policyarea == 1)

# get distribution of manifesto salience across 
# broad areas for plot
port_prop_pekkanen <- dat_reg |> 
    filter(type_posts == "Legislative Post (Combined)") |> 
    ungroup() |> 
    dplyr::select(area_pekkanen, prop_policyarea = prop_policyarea_relevant_bert_no_na) |> 
    mutate(area_pekkanen = factor(area_pekkanen, 
                                  levels = c("Distributive",
                                             "Public Goods",
                                             "High Policy")))

# reorder factor levels
est_pred_df_sum <- est_pred_df_sum |> 
    mutate(area_pekkanen = factor(area_pekkanen, 
                                  levels = c("Distributive",
                                             "Public Goods",
                                             "High Policy")))

# plot Figure 04
ggplot(est_pred_df_sum, aes(x = prop_policyarea, 
                            y = estimate,
                            ymin = conf.low,
                            ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_rug(data = port_prop_pekkanen, aes(x = prop_policyarea),
             inherit.aes = FALSE, 
             alpha = 1, 
             length = unit(0.025, "npc"),
             linewidth = 0.1, colour = "grey10") +
    geom_line() +
    facet_wrap(~area_pekkanen) +
    labs(x = "Manifesto Salience of Policy Area",
         y = "Pr(Post in Policy Area)")  +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 1, 0.2))) +
    scale_y_continuous(limits = c(0, 1.05),
                       breaks = c(seq(0, 1, 0.2))) +
    theme(strip.text = element_text(size = 14))
ggsave("fig_04.pdf", width = 9, height = 5)



# Figure 06 and Table 02 ----

# Use portfolio importance as alternative measure

# create empty list and data frem to store results
list_base_importance <- list()
dat_pred_importance <- data.frame()

# note: do not use policy area fixed effects since 
# covered by importance variable; apart from that
# model specification is identical
for (i in type_posts_vector) {
    lm_type_imp <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  *
                             importance_portfolio + # interact salience and importance
                             type_elected  +
                             poly(won_before_times_max9, 2) +
                             female +
                             distance_abs_wf_mean + 
                             dynasty |
                             factor(year) + factor(region), 
                         data = filter(dat_reg, type_posts == i),
                         vcov = vcov_cluster("filename"),
                         family = binomial(link = "logit"))
    
    list_base_importance[[i]] <- lm_type_imp
    
    dat_pred <- marginaleffects::plot_predictions(lm_type_imp, draw = FALSE,                                             condition = c("importance_portfolio",
                                                                                                                               "prop_policyarea_relevant_bert_no_na"))
    
    dat_pred$type_post <-  i
    dat_pred_importance <- bind_rows(dat_pred, dat_pred_importance)
}


# print Table 02
screenreg(list_base_importance, custom.coef.map = list_rename_coefs)

# save Table 02 as Word document
htmlreg(list_base_importance,
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_02.html")


# run model with combined measure (=Model 1 of table above)
glm_imp <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  *
                     importance_portfolio +
                     type_elected  +
                     poly(won_before_times_max9, 2) +
                     female +
                     distance_abs_wf_mean + 
                     dynasty |
                     factor(year) + factor(region), 
                 data = filter(dat_reg, type_posts == "Legislative Post (Combined)"),
                 vcov = vcov_cluster("filename"),
                 family = binomial(link = "logit"))


# plot predicted probabilities based on interaction 
# for three values of manifesto importance (0.1, 0.25, 0.5, 0.75)
pred_imp_min_max <- plot_predictions(
    glm_imp,
    draw = FALSE,                                     
    condition = list("importance_portfolio",
                     prop_policyarea_relevant_bert_no_na = c(0.1, 0.25, 0.5, 0.75))) 


pred_imp_min_max_clean <- pred_imp_min_max |> 
    mutate(prop_policyarea_relevant_bert_no_na = paste0("Manifesto Salience=", prop_policyarea_relevant_bert_no_na))

port_imp <- dat_reg |> 
    dplyr::select(importance_portfolio) |> 
    unique()

# Figure 06 
ggplot(pred_imp_min_max_clean, aes(x = importance_portfolio,
                                   y = estimate,
                                   ymin = conf.low,
                                   ymax = conf.high)) +
    geom_ribbon(fill = "grey80") + 
    geom_rug(data = port_imp, aes(x = importance_portfolio),
             inherit.aes = FALSE, 
             length = unit(0.035, "npc"),
             linewidth = 0.8, colour = "grey10") +
    geom_line() + 
    scale_y_continuous(limits = c(0,1)) +
    facet_wrap(~prop_policyarea_relevant_bert_no_na, nrow = 1) +
    labs(x = "Portfolio Importance", y = "Pr(Obtaining Post)") +
    theme(strip.text.x = element_text(size = 12))
ggsave("fig_06.pdf",
       width = 9, height = 5)


# Figure 05 ----
# Separate models for each policy area 

# empty data frame to store coefficients
tidy_area_all <- data.frame()

# remove Ministerial Posts due to low number of obserations
dat_reg_areas <- filter(dat_reg, !type_posts %in% c("Ministerial Post"))

# run model in a loop for each policy area and type of post
for (i in unique(dat_reg_areas$policy_area_type_post)) {
    
    dat_reg_areas_subset <- filter(dat_reg_areas, policy_area_type_post  == i)
    
    dat_reg_areas_subset <- dat_reg_areas_subset |>
        mutate(prop_policyarea_relevant_bert_no_na_stand =
                   (prop_policyarea_relevant_bert_no_na - mean(prop_policyarea_relevant_bert_no_na)) / sd(prop_policyarea_relevant_bert_no_na))
    
    glm_area <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                          type_elected  +
                          poly(won_before_times_max9, 2) +
                          
                          female +
                          distance_abs_wf_mean + 
                          dynasty |
                          factor(year) + factor(region),
                      data = dat_reg_areas_subset,
                      vcov = vcov_cluster("filename"),
                      family = binomial(link = "logit"))
    
    
    # tidy coefficients in data frame
    tidy_area <- broom::tidy(glm_area) |> 
        mutate(model = i)
    
    # bind all models
    tidy_area_all <- bind_rows(tidy_area, tidy_area_all)
    
}

# filter only coefficient for manifesto salience
tidy_area_policyarea <- filter(tidy_area_all, str_detect(term, "prop_policy"))


# separate variable indicating area and type of post
tidy_area_policyarea <- tidy_area_policyarea |> 
    separate(model, into = c("policy_area_harmonised", "type_posts"), sep = "__")

# recode type of policy area (based on Pekkanen et al. 2006)
tidy_area_policyarea <- tidy_area_policyarea |> 
    mutate(area_pekkanen  = 
               case_when(
                   str_detect(policy_area_harmonised, "Agriculture") ~ "Distributive",
                   str_detect(policy_area_harmonised, "Financial") ~ "High Policy",
                   str_detect(policy_area_harmonised, "Security") ~ "High Policy",
                   str_detect(policy_area_harmonised, "Cabinet") ~ "High Policy",
                   str_detect(policy_area_harmonised, "Trade") ~ "Distributive",
                   str_detect(policy_area_harmonised, "Education") ~ "Public Goods",
                   str_detect(policy_area_harmonised, "Environment") ~ "Public Goods",
                   str_detect(policy_area_harmonised, "Welfare") ~ "Public Goods",
                   str_detect(policy_area_harmonised, "Foreign") ~ "High Policy",
                   str_detect(policy_area_harmonised, "Internal") ~ "High Policy",
                   str_detect(policy_area_harmonised, "Infrastructure") ~ "Distributive"))


# recode factor levels
tidy_area_policyarea_clean <- tidy_area_policyarea |> 
    filter(!type_posts %in% c("Ministerial Post")) |> 
    mutate(area_pekkanen = factor(area_pekkanen, levels = c("Distributive",
                                                            "Public Goods",
                                                            "High Policy"
    ))) |> 
    mutate(type_posts = factor(type_posts, levels = c("Legislative Post (Combined)",
                                                      "Committee Post", "Cabinet Post", "Party Policy Division Post")))


ggplot(tidy_area_policyarea_clean,
       aes(x = reorder(policy_area_harmonised, -estimate), 
           y = estimate,
           ymin = estimate - 1.96 * std.error,
           ymax = estimate + 1.96 * std.error,
           colour = area_pekkanen)) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "black") +
    geom_point(size = 3) + 
    scale_y_continuous(breaks = c(seq(-5, 15, 5))) +
    geom_linerange(linewidth = 0.8) +
    scale_x_discrete(labels = scales::label_wrap(30)) +
    scale_shape_manual(name = "Policy Area", values = c(17, 16, 15)) +
    scale_colour_manual(name = "Policy Area", values = c("black", 
                                                         "#00798c",
                                                         "#d1495b" )) +
    facet_grid(area_pekkanen~type_posts, space = "free_y", scales = "free_y",
               labeller = label_wrap_gen(width = 20)) +
    labs(y = "Coefficient of Manifesto Salience", x = NULL) +
    coord_flip() +
    theme(legend.position = "none", strip.text = element_text(size = 13))
ggsave("fig_05.pdf", width = 9, height = 6)


# Table A01 ----
# Manually created table based on report from election studies
# No code required

# Table A02 ----
# Overview of number of available posts
# No code required

# Table A03 (included in 07_performance_all_classifiers.R) -----

# Figure A01 ----
# Descriptive: number of positions overall and within policy area ---

# check how many politicians held several posts per policy area?

dat_several <- dat_reg |> 
    dplyr::select(position_dummy, type_posts, policy_area_harmonised,
                  female, region, dynasty, distance_abs_wf_mean, #type_elected,
                  prop_policyarea_relevant_bert_no_na, type_elected,
                  won_before_times_max9,
                  year, filename) |> 
    spread(type_posts, position_dummy) |> 
    mutate(`Party Policy Division Post` = ifelse(is.na(`Party Policy Division Post`), 0, `Party Policy Division Post`)) |> 
    mutate(sum_posts_area = `Cabinet Post` + `Committee Post` + `Party Policy Division Post`) |> 
    arrange(-sum_posts_area) |> 
    mutate(one = ifelse(sum_posts_area == 1, 1, 0)) |> 
    mutate(more_than_one = ifelse(sum_posts_area > 1, 1, 0))


# Get number of politicians per cycle with several posts
dat_several_politicians <- dat_several |>
    group_by(filename, year) |> 
    summarise(sum_areas_more_than_one = sum(more_than_one),
              sum_one = sum(one))  
#filter(sum_areas_more_than_one > 0) # only filter on politicians with more than one posts per area

prop.table(table(dat_several_politicians$sum_one))

dat_more_than_one <- dat_several |>
    group_by(filename, year) |> 
    summarise(more_than_one_in_area = sum(more_than_one)) |> 
    mutate(more_than_one_dummy = ifelse(more_than_one_in_area > 0, 1, 0))

dat_more_than_one |> 
    ungroup() |> 
    summarise(mean_more_than_one = mean(more_than_one_dummy),
              n = n())


dat_mps <- dat_reg |> 
    dplyr::select(filename, year) |> 
    unique() |> 
    group_by(year) |> 
    count()

dat_sum_with0 <- dat_several |> 
    group_by(year) |> 
    summarise(mean = mean(more_than_one),
              sum = sum(more_than_one)) |> 
    mutate(Type = "All MPs")

dat_sum_without0 <- dat_several |> 
    filter(sum_posts_area > 0) |> 
    group_by(year) |> 
    summarise(mean = mean(more_than_one),
              sum = sum(more_than_one)) |> 
    mutate(Type = "MPs Holding Posts")


dat_sum_bind <- bind_rows(dat_sum_with0, dat_sum_without0)

dat_sum_bind

dat_sum_bind_avg <- dat_sum_bind |> 
    group_by(Type) |> 
    summarise(mean = mean(mean))

# print data frame since numbers reported in paper
dat_sum_bind_avg

# Get number of posts per MP

dat_positions_sum <- dat_reg |> 
    group_by(id_smith, year, type_posts) |> 
    summarise(sum_posts = sum(position_dummy)) |> 
    group_by(type_posts, sum_posts) |> 
    count() |> 
    group_by(type_posts) |> 
    mutate(prop = n / sum(n)) |> 
    mutate(label = paste0("n=", n, "\n(", 100 * round(prop, 3), "%)")) |> 
    mutate(zero_dummy = ifelse(sum_posts == "0", "1", "0"))



# check how many MPs did not obtain ANY post
# by manually adding up the four types of posts
# since Legislative Post (Combined) does not consider
# minister positions
dat_positions_any <- dat_reg |> 
    filter(type_posts != "Legislative Post (Combined)") |> 
    group_by(id_smith, year) |> 
    summarise(sum_posts = sum(position_dummy)) |> 
    group_by(sum_posts) |> 
    count() |> 
    ungroup() |> 
    mutate(prop = n / sum(n)) |> 
    mutate(type_posts = "All Posts (Combined)") |> 
    mutate(prop = n / sum(n)) |> 
    mutate(label = paste0("n=", n, "\n(", 100 * round(prop, 3), "%)")) |> 
    mutate(zero_dummy = ifelse(sum_posts == "0", "1", "0"))



# define factor levels for positions plot 
levels_positions_extended <- c("Ministerial Post",
                               "Party Policy Division Post",
                               "Committee Post",
                               "Cabinet Post",
                               "Legislative Post (Combined),\nExcluding Ministerial Posts",
                               "All Posts (Combined)"
)


# bind both variables
dat_positions_sum_bind <- bind_rows(dat_positions_sum, 
                                    dat_positions_any) |> 
    mutate(type_posts = ifelse(type_posts == "Legislative Post (Combined)",
                               "Legislative Post (Combined),\nExcluding Ministerial Posts",
                               type_posts)) |> 
    mutate(type_posts = fct_rev(factor(type_posts, levels_positions_extended))) 


ggplot(dat_positions_sum_bind, aes(y = n, x = factor(sum_posts),
                                   fill = zero_dummy)) +
    geom_bar(stat = "identity") +
    facet_grid(type_posts~.) +
    scale_fill_manual(values = c("grey20", "grey70")) +
    scale_y_continuous(limits = c(0, 1600),
                       breaks = c(seq(0, 2000, 400))) +
    geom_text(aes(label = label), 
              nudge_y = 250, hjust = 0.5, size = 4.5) +
    labs(y = "Number of MPs", x = "Number of Positions (Across Policy Areas) During Legislative Cycle") + 
    theme(legend.position = "none",
          strip.text.y = element_text(angle = 360, hjust = 0))
ggsave("fig_a01.pdf",
       width = 10, height = 12)


dat_reg_unique <- dat_reg |> 
    filter(type_posts == "Legislative Post (Combined)")

table(dat_reg_unique$party_en)

dat_reg_unique |> 
    filter(!party_en %in% c("LDP", "DPJ")) |> 
    dplyr::select(filename, party_en, name_jp, pid) |> 
    unique()

table(dat_reg$party_en)


# Figure A02----

dat_imp <- dat_reg |> 
    ungroup() |> 
    dplyr::select(importance_portfolio, year, policy_area_harmonised,
                  area_pekkanen) |> 
    unique() |> 
    filter(!is.na(importance_portfolio)) |> 
    group_by(year) |> 
    mutate(importance_portfolio_3 = ntile(importance_portfolio, 3))


dat_means <- dat_imp |> 
    ungroup() |> 
    group_by(policy_area_harmonised, area_pekkanen) |> 
    summarise(mean = mean(importance_portfolio)) |> 
    arrange(-mean)

# get overall mean and standard deviation
# which are reported in the paper
summary(dat_means$mean)
sd(dat_means$mean)

# get portfolio importance by Pekkanen et al (2006) area,
# reported in paper
dat_imp |> 
    group_by(area_pekkanen) |> 
    summarise(mean = mean(importance_portfolio))


# plot figure
set.seed(23) # set seed for jitter
ggplot(dat_imp, aes(x = reorder(policy_area_harmonised, importance_portfolio),
                    y = importance_portfolio,
                    fill = factor(year),
                    shape = factor(year))) +
    geom_jitter(size = 3, width = 0.1, height = 0) +
    scale_colour_grey(start  = 0.8, end = 0.3) +
    coord_flip() +
    scale_shape_manual(values = c(1, 0, 16, 15, 18)) +
    labs(y = "Portfolio Importance (Expert Surveys)", 
         x = NULL) +
    theme(legend.title = element_blank())
ggsave("fig_a02.pdf",
       width = 9, height = 5)


# Figures A03, A04, A05, A07: either created in other files or no code involved -----


# Table A06 ----
# get number of statements and manifestos

# sum of positions
dat_grouped_pol <- dat_reg |> 
    group_by(filename, year, n_sentences_manifesto, type_posts) |>
    summarise(type_posts_pol = sum(position_dummy))


# load BERT-classified data
dat_predicted <- readr::read_csv("data_predicted_full.csv")

# get number of statements per manifesto (policy and non-policy)
dat_sum_sents <- dat_predicted |> 
    group_by(doc_id) |> 
    count() |> 
    rename(filename = doc_id,
           n_sentences_manifesto = n)

dat_grouped_man <- dat_reg |>
    dplyr::select(filename, year) |> 
    unique()

dat_grouped_man_sents <- left_join(dat_grouped_man, dat_sum_sents,
                             by = "filename")

dat_grouped_man_clean <- dat_grouped_man_sents |>
    mutate(Year = year) |> 
    group_by(Year) |> 
    summarise(Manifestos = length(unique(filename)),
              Statements = sum(n_sentences_manifesto, na.rm = TRUE),
              `Statements (Mean)` = mean(n_sentences_manifesto, na.rm = TRUE),
              `Statements (Median)` = as.integer(median(n_sentences_manifesto, na.rm = TRUE)),
              `Statements (SD)` = sd(n_sentences_manifesto, na.rm = TRUE))

# get number of manifestos and statements (reported in abstract and paper)

sum(dat_grouped_man_clean$Manifestos)
sum(dat_grouped_man_clean$Statements)

# load Smith and Reed dataset
dat_reed_smith_subset <- readstata13::read.dta13("Reed-Smith-JHRED-CANDIDATES.dta") |> 
    filter(result %in% c("1", "2", "3")) |> 
    filter(year > 2001) |> 
    filter(party_en == "LDP" & year == "2003" |
               party_en == "LDP" & year == "2005" |
               party_en == "DPJ" & year == "2009" |
               party_en == "LDP" & year == "2012" |
               party_en == "LDP" & year == "2014")

table(dat_reed_smith_subset$party_en,
      dat_reed_smith_subset$year)

# Get overview of available candidates
dat_man_available <- read.csv("data_candidates_available.csv",
                              fileEncoding = "utf-8") |> 
    rename(Year = year)


dat_man_available

dat_man_available$Year <- as.character(dat_man_available$Year)
dat_grouped_man_clean$Year <- as.character(dat_grouped_man_clean$Year)

dat_grouped_man_full <- left_join(dat_grouped_man_clean, 
                                  dat_man_available,
                                  by = "Year") |> 
    dplyr::select(Year, Manifestos,
                  `Gov. Party` = party_en,
                  everything()) |> 
    dplyr::select(-available_manifestos) |> 
    mutate(Year = as.integer(Year))#

# print table as data frame
as.data.frame(dat_grouped_man_full)

sum(dat_grouped_man_full$Statements)

# store table
print(xtable(dat_grouped_man_full, digits = 1),
      file = "tab_a06.html",
      type = "html", include.rownames=FALSE)


# Figure A08, and Table A07 ----
# first-differences and regression underlying Figures 4 and A07
# function to get first differences for the three combinations

get_firstdiffs <- function(typ_value) {
    
    est_public_dist <- sim_setx(s, 
                                x = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "Public Goods"),
                                x1 = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "Distributive"),
                                verbose = FALSE)
    
    est_public_dist_df <- as.data.frame(est_public_dist) |> 
        mutate(model = "First Difference: Distributive vs. Public Goods")
    
    est_public_high <- sim_setx(s, x = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "Public Goods"),
                                x1 = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "High Policy"),
                                verbose = FALSE)
    
    est_public_high_df <- est_public_high |> 
        as.data.frame() |>
        mutate(model = "First Difference: High Policy vs. Public Goods")
    
    
    est_high_dist <- sim_setx(s, x = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "High Policy"),
                              x1 = list(prop_policyarea_relevant_bert_no_na = typ_value, area_pekkanen = "Distributive"),
                              verbose = FALSE)
    
    est_high_dist_df <- est_high_dist |> 
        as.data.frame() |>
        mutate(model = "First Difference: Distributive vs. High Policy")
    
    
    dat_firstdiff <- bind_rows(est_public_dist_df, est_public_high_df, est_high_dist_df)
    
    dat_firstdiff_agg <- dat_firstdiff |> 
        group_by(model) |> 
        summarise(estimate = mean(FD),
                  ci_lower = quantile(FD, 0.025),
                  ci_upper = quantile(FD, 0.975))
    
    return(dat_firstdiff_agg)
    
}

# run first-difference analysis for values of manifesto salience
# ranging from 0 to 1 in steps of 0.1
# running the code below might take a few minutes
# due to the large amount of simulations

dat_firstdiffs <- data.frame()

for (i in seq(0, 1, 0.1)) {
    
    cat("First difference estimation for:", i, "\n")
    
    firstdiffs <- get_firstdiffs(typ_value = i) |> 
        mutate(prop_policyarea = i)
    
    dat_firstdiffs <- bind_rows(dat_firstdiffs, firstdiffs)
    
    
}


# clean up model names for plot
dat_firstdiffs_clean <- dat_firstdiffs |> 
    mutate(model = str_replace_all(model, ": ", ":\n"))

# get the values at 0 for plot
dat_firstdiffs_0 <- dat_firstdiffs_clean |> 
    group_by(model) |> 
    mutate(average_fd = mean(estimate)) |> 
    filter(prop_policyarea == 0) |> 
    mutate(higher = case_when(
        str_detect(model, "High Policy vs. Public Goods") ~ "High Policy",
        str_detect(model, "Distributive vs. Public Goods") ~ "Distributive",
        str_detect(model, "Distributive vs. High Policy") ~ "Distributive",
        
        
    )) |> 
    mutate(lower = case_when(
        str_detect(model, "High Policy vs. Public Goods") ~ "Public Goods",
        str_detect(model, "Distributive vs. Public Goods") ~ "Public Goods",
        str_detect(model, "Distributive vs. High Policy") ~ "High Policy",
        
        
    ))


# change factor levels
dat_firstdiffs_clean <- dat_firstdiffs_clean |> 
    mutate(model = factor(model,
                          levels = c("First Difference:\nDistributive vs. Public Goods",
                                     "First Difference:\nDistributive vs. High Policy",
                                     "First Difference:\nHigh Policy vs. Public Goods")))

dat_firstdiffs_0 <- dat_firstdiffs_0 |> 
    mutate(model = factor(model,
                          levels = c("First Difference:\nDistributive vs. Public Goods",
                                     "First Difference:\nDistributive vs. High Policy",
                                     "First Difference:\nHigh Policy vs. Public Goods")))



# plot Figure A08
ggplot(dat_firstdiffs_clean, aes(x = prop_policyarea, y = estimate,
                                 ymin = ci_lower,
                                 ymax = ci_upper)) +
    geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
    geom_point(size = 3) + 
    geom_linerange(linewidth = 0.8) +
    facet_wrap(~model, nrow = 1) +
    geom_text(data = dat_firstdiffs_0,
              aes(label = paste0("Average FD: ", round(average_fd, 2)),
                  x = 0.98, y = 0.6,
                  hjust = 1),
              size = 4.3,
              colour = "grey30") +
    scale_y_continuous(limits = c(-0.2, 0.6),
                       breaks = c(seq(-0.2, 0.6, 0.2))) +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 1, 0.2))) +
    geom_text(data = dat_firstdiffs_0,
              aes(label = higher),
              x = 0.08, y = 0.45,
              hjust = 0,
              size = 4.3,
              inherit.aes = FALSE, colour = "grey50") +
    geom_text(data = dat_firstdiffs_0,
              aes(label = lower),
              x = 0.08, y = -0.1,
              size = 4.3,
              hjust = 0,
              inherit.aes = FALSE, colour = "grey50") +
    geom_segment(aes(x = 0.05, y = 0.4, xend = 0.05, yend = 0.5),
                 arrow = arrow(length = unit(0.2, "cm")), colour = "grey50") +
    geom_segment(aes(x = 0.05, y = -0.05, xend = 0.05, yend = -0.15),
                 arrow = arrow(length = unit(0.2, "cm")), colour = "grey50") +
    labs(x = "Manifesto Salience of Policy Area", y = "Difference in Predicted Probabilities") +
    theme(strip.text = element_text(size = 14))
ggsave("fig_a08.pdf", width = 9, height = 5)


# Figure A09 and Table A08 ----
# separate models for Pekkanen areas, including models that split the high policy areas


# recode Pekkanen et al. into more fine-grained categories
# by distinguishing between domestic and foreign high policy
dat_reg <- dat_reg |> 
    mutate(area_pekkanen2 = ifelse(policy_area_harmonised %in% c("Internal Affairs and Communications",
                                                                 "Cabinet",
                                                                 "Financial Affairs"), 
                                   "High Policy (Domestic)",
                                   ifelse(policy_area_harmonised %in% c("Foreign Affairs",
                                                                        "Security"),
                                          "High Policy (Foreign)", area_pekkanen)))


# combined measure for comparison
mod_at_least_one <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                              type_elected  +
                              poly(won_before_times_max9, 2) +
                              female +
                              distance_abs_wf_mean +
                              dynasty |
                              policy_area_harmonised +
                              factor(year) + factor(region),
                          data = filter(dat_reg, type_posts == "Legislative Post (Combined)"),
                          vcov = vcov_cluster("filename"),
                          family = binomial(link = "logit"))


# high policy areas
mod_at_least_one_high <- update(
    mod_at_least_one, 
    data = filter(dat_reg, 
                  type_posts == "Legislative Post (Combined)" & 
                      area_pekkanen == "High Policy"))

# distributive policy areas
mod_at_least_one_dist <- update(
    mod_at_least_one, 
    data = filter(dat_reg, 
                  type_posts == "Legislative Post (Combined)" & 
                      area_pekkanen == "Distributive"))


# public goods policy areas
mod_at_least_one_publicgoods <- update(
    mod_at_least_one, 
    data = filter(dat_reg, 
                  type_posts == "Legislative Post (Combined)" & 
                      area_pekkanen == "Public Goods"))

# domestic high policy areas only
mod_at_least_one_high_domestic <- update(
    mod_at_least_one, 
    data = filter(dat_reg, 
                  type_posts == "Legislative Post (Combined)" & 
                      area_pekkanen2 == "High Policy (Domestic)"))

# foreign high policy areas only
mod_at_least_one_high_foreign <- update(
    mod_at_least_one, 
    data = filter(dat_reg, 
                  type_posts == "Legislative Post (Combined)" & 
                      area_pekkanen2 == "High Policy (Foreign)"))

# print regression table
screenreg(list(mod_at_least_one_dist,
               mod_at_least_one_publicgoods,
               mod_at_least_one_high_domestic,
               mod_at_least_one_high_foreign),
          custom.coef.map = list_rename_coefs)


# save Table A08 as Word document 
htmlreg(list(mod_at_least_one_dist,
             mod_at_least_one_publicgoods,
             mod_at_least_one_high_domestic,
             mod_at_least_one_high_foreign),
        custom.model.names = c("(1) Distributive",
                               "(2) Public Goods",
                               "(3) High Policy (Domestic)",
                               "(4) High Policy (Foreign)"),
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_a08.html")



# run models in loop and plot predicted probabilities 

# broader Pekkannen et al. (2006) policy areas,
# distinguishing between domestic and foreign high policy areas
policy_areas <- unique(dat_reg$area_pekkanen2)

# empty data frame to store coefficients
dat_eff_all <- data.frame()

for (i in policy_areas) {
    
    dat_policyarea <- filter(dat_reg, area_pekkanen2 == i)
    
    mod_area <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                          type_elected  +
                          poly(won_before_times_max9, 2) +
                          female +
                          distance_abs_wf_mean + 
                          dynasty |
                          policy_area_harmonised + 
                          factor(year) + factor(region),
                      data = filter(dat_policyarea, type_posts == "Legislative Post (Combined)"),
                      vcov = vcov_cluster("filename"),
                      family = binomial(link = "logit"))
    
    dat_eff <- marginaleffects::plot_predictions(mod_area, draw = FALSE,           
                                         condition = c("prop_policyarea_relevant_bert_no_na"))
    
    dat_eff$policy_area <- i
    
    dat_eff_all <- bind_rows(dat_eff, dat_eff_all)
    
    
}


# get distribution of variable for x-axis
dat_pekkanen_dist2 <- dat_reg |> 
    filter(type_posts == "Legislative Post (Combined)") |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na, area_pekkanen2) |> 
    unique() |> 
    rename(policy_area = area_pekkanen2)

# relevel factor 
dat_pekkanen_dist2$policy_area <- factor(dat_pekkanen_dist2$policy_area,
                                         levels = c(
                                             "Distributive",
                                             "Public Goods",
                                             "High Policy (Foreign)",
                                             "High Policy (Domestic)"))


dat_eff_all$policy_area <- factor(dat_eff_all$policy_area,
                                  levels = c(
                                      "Distributive",
                                      "Public Goods",
                                      "High Policy (Foreign)",
                                      "High Policy (Domestic)"))

ggplot(data = dat_eff_all, aes(x = prop_policyarea_relevant_bert_no_na, 
                               y = estimate,
                               ymin = conf.low,
                               ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_line() +
    geom_rug(data = dat_pekkanen_dist2, aes(x = prop_policyarea_relevant_bert_no_na),
             inherit.aes = FALSE, 
             alpha = 0.8, length = unit(0.025, "npc"),
             linewidth = 0.1, colour = "grey10") +
    facet_wrap(~policy_area, nrow = 1, labeller = label_wrap_gen(width = 20)) +
    labs(x = "Manifesto Salience of Policy Area",
         y = "Pr(Post in Policy Area)")  +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 1, 0.2))) +
    scale_y_continuous(limits = c(0, 1.0),
                       breaks = c(seq(0, 1, 0.2)))
ggsave("fig_a09.pdf",
       width = 9, height = 6)


# Figure A10 ----

# jackknife-style models: 
# exclude one policy area at a time 

# empty data frame to store regression coefficients
tidy_jackknife_all <- data.frame()

for (i in unique(dat_reg$policy_area_harmonised)) {
    
    dat_jackknife_area <- dat_reg |> 
        filter(type_posts == "Legislative Post (Combined)") |> 
        filter(policy_area_harmonised != i)
    
    glm_jackknife <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                               type_elected  +
                               poly(won_before_times_max9, 2) +
                               
                               female +
                               distance_abs_wf_mean + 
                               dynasty |
                               factor(year) + factor(region),
                           data = dat_jackknife_area,
                           vcov = vcov_cluster("filename"),
                           family = binomial(link = "logit"))
    
    # tidy coefficients and store in data frame
    tidy_jackknife <- broom::tidy(glm_jackknife) |> 
        mutate(excluded_area = i)
    
    # bind all rows
    tidy_jackknife_all <- bind_rows(tidy_jackknife, tidy_jackknife_all)
    
    
}

# adjust variable for y-axis and mention which policy area is exluded
tidy_jackknife_policyarea <- filter(tidy_jackknife_all, str_detect(term, "prop_policy")) |> 
    mutate(excluded_area = paste0("Exclude: ", excluded_area))

# plot Figure A10
ggplot(tidy_jackknife_policyarea,
       aes(x = fct_rev(excluded_area),
           y = estimate,
           ymin = estimate - 1.96 * std.error,
           ymax = estimate + 1.96 * std.error)) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
    geom_point(size = 3) + 
    coord_flip() +
    scale_y_continuous(breaks = c(seq(-3, 3, 0.5))) +
    geom_linerange(linewidth = 0.8) +
    labs(y = "Coefficient of Manifesto Salience", x = NULL) +
    theme(legend.position = "none", strip.text = element_text(size = 15))
ggsave("fig_a10.pdf", width = 9, height = 4)


# Figure A11 and Table A09 ----
# separate models for each election

# store lists and data frames for coefficients
# and predictions
list_years <- list()
dat_pred_years <- data.frame()

# levels of election years for loop
years <- c("2003", "2005", "2009", "2012", "2014")

# run model for each year in a loop
for (i in years) {
    
    lm_year <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                         type_elected  +
                         poly(won_before_times_max9, 2) +
                         female +
                         distance_abs_wf_mean + 
                         dynasty |
                         policy_area_harmonised + factor(region), 
                     data = filter(dat_reg, type_posts == "Legislative Post (Combined)" 
                                   & year == i),
                     vcov = vcov_cluster("filename"),
                     family = binomial(link = "logit"))
    
    year <- as.factor(i)
    
    list_years[[paste("Year:", i, sep = " ")]] <- lm_year
    
    # get predicted probabilities for manifesto salience
    dat_eff <- marginaleffects::plot_predictions(lm_year, draw = FALSE,    
                                                 condition = c("prop_policyarea_relevant_bert_no_na"))
    
    # add election year
    dat_eff$year <- factor(i)
    
    # combine all predicted values in one data frame
    dat_pred_years <- bind_rows(dat_eff, dat_pred_years)
}


# print regression table
screenreg(list_years,
          custom.coef.map = list_rename_coefs)

# store Table A09 as Word document
htmlreg(list_years,
        custom.coef.map = list_rename_coefs, 
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_a09.html")


# get distribution of manifesto salience for each election year
# which is added to plot
port_year_type <- dat_reg |>
    filter(type_posts == "Legislative Post (Combined)") |> 
    dplyr::select(year, prop_policyarea_relevant_bert_no_na) |>
    unique()

# relevel factor
dat_pred_years <- dat_pred_years |> 
    mutate(year = factor(year, levels = c("2003", "2005", "2009", 
                                          "2012", "2014")))

# plot Figure A11 
ggplot(data = dat_pred_years, aes(x = prop_policyarea_relevant_bert_no_na, 
                                  y = estimate,
                                  ymin = conf.low,
                                  ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_line() +
    geom_rug(data = port_year_type, aes(x = prop_policyarea_relevant_bert_no_na),
             inherit.aes = FALSE, 
             alpha = 0.8, length = unit(0.025, "npc"),
             linewidth = 0.1, colour = "grey10") +
    facet_wrap(~year, nrow = 1, labeller = label_wrap_gen(width = 18)) +
    labs(x = "Manifesto Salience of Policy Area",
         y = "Pr(Post in Policy Area)")  +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 0.9, 0.3))) +
    scale_y_continuous(limits = c(-0.1, 1.1),
                       breaks = c(seq(0, 1, 0.2)))
ggsave("fig_a11.pdf", width = 9, height = 5)


# Figure A12 and Table A10 ----
# run models with various specifications

# get all of controls
controls <- "+ type_elected  +
                     poly(won_before_times_max9, 2) +
                      
                     female +
                     distance_abs_wf_mean + dynasty "

# all fixed effects
fixed_effects <- "| factor(policy_area_harmonised) + 
                     factor(year) + factor(region)"

# model without controls and without fixed effects
glm_nocontrols <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na")),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
) 

glm_nocontrols_tidy <- glm_nocontrols |> 
    tidy() |> 
    mutate(model = "M1: No Controls, No Fixed Effects")

glm_controls <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na",
                      controls)),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
) 

glm_controls_tidy <- glm_controls |> 
    tidy() |> 
    mutate(model = "M2: Controls, No Fixed Effects")


glm_controls_fe_policy <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na",
                      controls, " | factor(policy_area_harmonised)")),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
) 

glm_controls_fe_policy_tidy <- glm_controls_fe_policy |> 
    tidy() |> 
    mutate(model = "M3: Controls, FE: Policy Area")


glm_controls_fe_year <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na",
                      controls, " | factor(year)")),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
)

glm_controls_fe_year_tidy <- glm_controls_fe_year |> 
    tidy() |> 
    mutate(model = "M4: Controls, FE: Year")

glm_controls_fe_region <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na",
                      controls, " | factor(region)")),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
)

glm_controls_fe_region_tidy <- glm_controls_fe_region |> 
    tidy() |> 
    mutate(model = "M5: Controls, FE: Region")

glm_controls_fe_all <- feglm(
    as.formula(paste0("position_dummy ~ prop_policyarea_relevant_bert_no_na",
                      controls, fixed_effects)),
    family = binomial(link = "logit"),
    vcov = vcov_cluster("filename"),
    data = filter(dat_reg, type_posts == "Legislative Post (Combined)")
) 

glm_controls_fe_all_tidy <- glm_controls_fe_all |> 
    tidy() |> 
    mutate(model = "M6: Controls, FE: Policy Area, Year, Region")


# combine tidy coefficients of all models
models_all <- bind_rows(glm_controls_tidy,
                        glm_nocontrols_tidy,
                        glm_controls_fe_policy_tidy,
                        glm_controls_fe_year_tidy,
                        glm_controls_fe_region_tidy,
                        glm_controls_fe_all_tidy) |> 
    filter(str_detect(term, "prop_policy")) |> 
    mutate(main_model = ifelse(str_detect(model, "M6"), "Main Model", "Other Models"))


# plot Figure A12 
ggplot(models_all,
       aes(x = fct_rev(model), y = estimate,
           colour = main_model,
           shape = main_model,
           ymin = estimate - 1.96 * std.error,
           ymax = estimate + 1.96 * std.error)) +
    geom_point(size = 3.5) +
    geom_linerange(linewidth = 1.05) +
    scale_y_continuous(breaks = c(seq(-2, 8, 0.5))) +
    coord_flip() +
    scale_shape_manual(values = c(15, 16)) +
    scale_colour_manual(values = c("darkred", "grey30")) +
    labs(y =  "Coefficient of Manifesto Salience", x = NULL) +
    geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
    theme(legend.position = "none")
ggsave("fig_a12.pdf",  
       width = 9, height = 4)


## print egression table
screenreg(list(glm_controls,
               glm_nocontrols,
               glm_controls_fe_policy,
               glm_controls_fe_year,
               glm_controls_fe_region,
               glm_controls_fe_all),
          custom.coef.map = list_rename_coefs)


# save Table A10 as Word document 
htmlreg(list(glm_nocontrols, 
             glm_controls,
             glm_controls_fe_policy,
             glm_controls_fe_year,
             glm_controls_fe_region,
             glm_controls_fe_all),
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Deviance", "Log Likelihood", "Pseudo R2",
            "Fixed effects: Policy Area",
            "Fixed effects: Year",
            "Fixed effects: Region"
        ),
        file = "tab_a10.html")


# Figure A14 ----
# use count variable (count_bert) instead of proportions as 
# main independent variable

# empty data frame and list for regression models
# and predicted probabilities
dat_pred_count <- data.frame()
list_base_count <- list()

for (i in type_posts_vector) {
    
    lm_type_count <- feglm(position_dummy ~ count_bert  +
                               type_elected  +
                               poly(won_before_times_max9, 2) +
                               
                               female +
                               distance_abs_wf_mean + 
                               dynasty |
                               policy_area_harmonised + 
                               factor(year) + factor(region), 
                           data = filter(dat_reg, type_posts == i),
                           vcov = vcov_cluster("filename"),
                           family = binomial(link = "logit"))
    
    list_base_count[[i]] <- lm_type_count
    
    # get predicted probabilities
    dat_eff <- marginaleffects::plot_predictions(lm_type_count, draw = FALSE,                                                 condition = c("count_bert"))
    
    # store information on type of post
    dat_eff$type_post <- i
    
    # bind all predicted probabilities
    dat_pred_count <- bind_rows(dat_eff, dat_pred_count)
}

# print regression table
screenreg(list_base_count, custom.coef.map = list_rename_coefs)

# save Table A12 as Word document
htmlreg(list_base_count,
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_a12.html")


# relevel factor levels for predicted probability plots
dat_pred_count$type_post <- factor(dat_pred_count$type_post,
                                   levels = rev(levels_positions))


# get distribution of values for x-axis
port_count_type <- dat_reg |>
    dplyr::select(type_posts, count_bert) |>
    unique()

# plot Figure A14
ggplot(data = dat_pred_count, aes(x = count_bert, 
                                  y = estimate,
                                  ymin = conf.low,
                                  ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_rug(data = port_count_type, aes(x = count_bert),
             inherit.aes = FALSE, 
             alpha = 1, length = unit(0.025, "npc"),
             linewidth = 0.2, colour = "grey10") +
    geom_line() +
    coord_cartesian(ylim = c(0, 1)) +
    facet_wrap(~type_post, nrow = 1, labeller = label_wrap_gen(width = 18)) +
    labs(x = "Number of Statements About Policy Area",
         y = "Pr(Post in Policy Area)") 
ggsave("fig_a14.pdf", width = 9, height = 6)


# Figure A13 and Table A11 ----
# run ordinal logistic regression model based on count of posts

# get sum of positions
table(dat_several$sum_posts_area)

# run ordinal logistic regression model
lm_polr <- suppressWarnings(MASS::polr(factor(sum_posts_area) ~ prop_policyarea_relevant_bert_no_na  +
                          type_elected  +
                          poly(won_before_times_max9, 2) +
                          female +
                          distance_abs_wf_mean + 
                          dynasty +
                          policy_area_harmonised +  factor(year) + factor(region), 
                      Hess = TRUE,
                      data = dat_several))


# print regression model
# modelsummary(list("Model 1" = lm_polr))

# save Table A11 as Word document
modelsummary(list("Model 1" = lm_polr),
             output = "tab_a11.docx",
             statistic = NULL, estimate = "{estimate} ({std.error}){stars}",
             vcov = vcovCL(lm_polr, cluster = ~ filename), # cluster standard errors
             stars = c('*' = .05, '**' = .001, '***' = .0001),
             coef_omit = c("year|region|(Intercept)"),
             coef_map = c("0|1" = "0 Posts|1 Post",
                          "1|2" = "1 Post|2 Posts",
                          "2|3" = "2 Posts|3 Posts",
                          "prop_policyarea_relevant_bert_no_na" = "Manifesto Salience",
                          "area_pekkanenHigh Policy" = "Area: High Policy (ref.: Distributive)",
                          "area_pekkanenPublic Goods" = "Area: Public Goods",
                          "type_electedZombie" = "Elected: Zombie (ref.: SMD)",
                          "poly(won_before_times_max9, 2)1" = "Number of Terms",
                          "poly(won_before_times_max9, 2)2" = "Number of Terms (squared)",
                          "female1" = "Gender: Female (ref.: Male)",
                          "distance_abs_wf_mean" = "Ideological Distance from Party",
                          "dynasty" = "Dynasty",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenHigh Policy" = "Manifesto Salience x Area: High Policy",
                          "prop_policyarea_relevant_bert_no_na:area_pekkanenPublic Goods" = "Manifesto Salience x Area: Public Goods"))


# plot predicted probabilities with standard errors clustered by manifesto
pred_polr <- plot_predictions(lm_polr, condition = c("prop_policyarea_relevant_bert_no_na"), 
                              type = "probs",
                              vcov = vcovCL(lm_polr, cluster = ~ filename), draw = FALSE)

# get predicted values at specified values
# which are reported in the paper

# no post, no focus
pred_polr |> 
    filter(prop_policyarea_relevant_bert_no_na == 0) |> 
    filter(sum_posts_area == 0) |> 
    dplyr::select(estimate, conf.low, conf.high)

# no post, full focus
pred_polr |> 
    filter(prop_policyarea_relevant_bert_no_na == 1) |> 
    filter(sum_posts_area == 0) |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na, estimate, conf.low, conf.high)

# improve label for faces
pred_polr <- pred_polr |> 
    mutate(group = paste0("Number of Posts=", group))


# get distribution of variable for x-axis
dat_several_dist2 <- dat_several |> 
    dplyr::select(prop_policyarea_relevant_bert_no_na, sum_posts_area) |> 
    rename(group = sum_posts_area) |> 
    unique() |> 
    mutate(group = as.factor(group)) |> 
    mutate(group = paste0("Number of Posts=", group))


# plot Figure A13
ggplot(pred_polr, aes(x = prop_policyarea_relevant_bert_no_na,
                      y = estimate, ymin = conf.low, ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    labs(x = "Manifesto Salience of Policy Area",
         y = "Pr(0|1|2|3 Posts in Policy Area)")  +
    geom_line() +     
    geom_rug(data = dat_several_dist2, aes(x = prop_policyarea_relevant_bert_no_na),
             inherit.aes = FALSE,
             alpha = 0.8, length = unit(0.025, "npc"),
             linewidth = 0.1, colour = "grey10") +
    scale_x_continuous(limits = c(0, 1),
                       breaks = c(seq(0, 0.9, 0.3))) +
    scale_y_continuous(limits = c(0, 1)) +
    facet_wrap(~group, nrow = 1)
ggsave("fig_a13.pdf", width = 9, height = 5)


# Table A13 ----

# model controlling for previous position (lagged DV) 

# empty list and data frame for regression models
list_base_lag <- list()


# run models in loop for all types of posts
for (i in type_posts_vector) {
    
    dat_reg$position_dummy_lag <- factor(dat_reg$position_dummy_lag)
    lm_type_lag <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                             position_dummy_lag + 
                             type_elected  +
                             poly(won_before_times_max9, 2) +
                             
                             female +
                             distance_abs_wf_mean + 
                             dynasty |
                             policy_area_harmonised + 
                             factor(year) + factor(region), 
                         data = filter(dat_reg, type_posts == i),
                         vcov = vcov_cluster("filename"),
                         family = binomial(link = "logit"))
    
    # combine regression models in list
    list_base_lag[[i]] <- lm_type_lag
    
    
}

# print regression table
screenreg(list_base_lag, 
          custom.coef.map = list_rename_coefs)

# save Table A13 as Word document
htmlreg(list_base_lag,
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_a13.html")




# Table A14 ----
# models with varying levels of prior experience in parliament


# not elected before
glm_beforetimes0 <- feglm(position_dummy ~ 
                              prop_policyarea_relevant_bert_no_na  +
                              type_elected  +
                              female +
                              distance_abs_wf_mean + 
                              dynasty |
                              policy_area_harmonised + 
                              factor(year) + factor(region), 
                          data = filter(dat_reg, 
                                        type_posts == "Legislative Post (Combined)" & 
                                            won_before_times == 0),
                          vcov = vcov_cluster("filename"),
                          family = binomial(link = "logit"))

# elected once before
glm_beforetimes1 <- update(glm_beforetimes0,
                           data = filter(dat_reg, 
                                         type_posts == "Legislative Post (Combined)" & 
                                             won_before_times == 1))

# elected twice before
glm_beforetimes2 <- update(glm_beforetimes0,
                           data = filter(dat_reg, 
                                         type_posts == "Legislative Post (Combined)" & 
                                             won_before_times == 2))


# elected three or more times
glm_beforetimes3more <- update(glm_beforetimes0,
                               data = filter(dat_reg, 
                                             type_posts == "Legislative Post (Combined)" & 
                                                 won_before_times >= 3))

# full sample
glm_beforetimesfullsample <- update(glm_beforetimes0,
                                    data = filter(dat_reg, 
                                                  type_posts == "Legislative Post (Combined)"))

# print regression table
screenreg(list(glm_beforetimes0,
               glm_beforetimes1,
               glm_beforetimes2,
               glm_beforetimes3more,
               glm_beforetimesfullsample),
          custom.coef.map = list_rename_coefs)


# save Table A14 as Word document
htmlreg(list(glm_beforetimes0,
             glm_beforetimes1,
             glm_beforetimes2,
             glm_beforetimes3more,
             glm_beforetimesfullsample),
        custom.model.names = c("(1) First Time",
                               "(2) Second Time",
                               "(3) Third Time",
                               "(4) >=Four Times",
                               "(5) Full Sample"),
        custom.coef.map = list_rename_coefs,
        caption = NULL,
        single.row = TRUE,
        custom.gof.names = c(
            "Num. obs.",
            "Fixed effects: Policy Area",
            "Fixed effects: Election Year",
            "Fixed effects: Region",
            "Deviance", "Log Likelihood", "Pseudo R2"
        ),
        file = "tab_a14.html")


# Figure A15 ----

# Show predicted probabilities of obtaining post
# conditional on previous wins based on main model reported in Table 1

lm_at_least_one_plot <- feglm(position_dummy ~ prop_policyarea_relevant_bert_no_na  +
                                  type_elected  +
                                  poly(won_before_times_max9, 2) +
                                  female +
                                  distance_abs_wf_mean + 
                                  dynasty |
                                  policy_area_harmonised + 
                                  factor(year) + factor(region), 
                              data = filter(dat_reg, type_posts == "Legislative Post (Combined)"),
                              vcov = vcov_cluster("filename"),
                              family = binomial(link = "logit"))


# get predicted probabilities
p_elected <- marginaleffects::plot_predictions(
    lm_at_least_one_plot, draw = FALSE,                                            
    condition = c("won_before_times_max9"))


# plot Figure A15
ggplot(data = p_elected, aes(x = won_before_times_max9, 
                             y = estimate,
                             ymin = conf.low,
                             ymax = conf.high)) +
    geom_ribbon(fill = "grey80") +
    geom_line() +
    scale_x_continuous(breaks = c(seq(0, 9, 1)),
                       labels = c(0:8, ">=9")) +
    labs(x = "Number of Times Elected", y = "Pr(Obtaining Post in Policy Area)")
ggsave("fig_a15.pdf",
       width = 9, height = 5)
